By Franklin Oliveira
This notebook contains all code necessary to make the "type" charts from crustacea database. Here you'll find some basic data treatment and charts' code.
Database: 'Planilha geral Atualizda FINAL com correções.xlsx'.
import datetime
import numpy as np
import pandas as pd
from collections import defaultdict
# quick visualizations
import matplotlib.pyplot as plt
# Waffle Charts
from pywaffle import Waffle
# docs: https://pywaffle.readthedocs.io/en/latest/examples/block_shape_distance_location_and_direction.html
# vizualization
import altair as alt
# enabling notebook renderer
alt.renderers.enable('notebook')
# disabling rows limit
alt.data_transformers.disable_max_rows()
excel = pd.ExcelFile('Planilha geral Atualizda FINAL com correções.xlsx')
sheet_name = excel.sheet_names
print('The excel file contains the following sheets:', sheet_name)
print('\nDatabase is in sheet:', sheet_name[0])
# db = excel.parse(sheet_name[0], sep=';', encoding='utf-8-sig')
db = pd.read_csv('db.csv', sep=',', encoding='utf-8-sig', low_memory=False) # db pre-treated by Asla
print(f'The database has {db.shape[0]} rows and {db.shape[1]} columns.')
# copying database to another variable to make a few changes keeping the original intact
crustaceas = db.copy()
Determiner First Name1 e Determiner Last Name1
In this step, we're going to put together determiner's first and last name.
Determiner First Name1 + Determiner Last Name1
Just concatenating those two columns is not satisfactory because there's some dirt in these data columns. So, let's begin by treating them...
def treat_names(name, pos='first'):
'''
Treat names keeping NaN as such.
Arguments:
- name: name to be treated.
- pos (str): name position. One of ['first', 'last']
'''
if type(name) == str and pos == 'first': # first name
if len(name.split(' ')) > 1: # checks if it's a composite name (+ 1 name)
return str(name).strip().split(' ')[0].capitalize() # keeps only 1st name
else:
return str(name).strip().capitalize()
elif type(name) == str and pos == 'last': # last name
if len(name.split(' ')) > 1: # treats composite name (+ 1 surname)
return str(name).strip().split(' ')[-1].capitalize()
else:
return str(name).strip().capitalize()
else:
return name
applying function to the two column names
crustaceas['Determiner First Name1'] = crustaceas['Determiner First Name1'].apply(treat_names)
# crustaceas['Determiner First Name1'].value_counts()
crustaceas['Determiner Last Name1'] = crustaceas['Determiner Last Name1'].apply(lambda x:
treat_names(x, pos='last'))
# crustaceas['Determiner Last Name1'].value_counts()
# creating column with First and Last name for identification
crustaceas['Determiner First_and_Last Name'] = crustaceas['Determiner First Name1'] + ' ' + crustaceas['Determiner Last Name1']
crustaceas.columns = [str(col).replace(r'\n','') for col in crustaceas.columns]
Type Status1¶crustaceas['Type Status1'].value_counts().head()
crustaceas['Type Status1'] = crustaceas['Type Status1'].str.strip().str.lower().str.capitalize()
crustaceas['Type Status1'].value_counts().head()
Kingdom - Phylum1 - Subphylum1 - Class1 - Subclass1 - Infraclass1 - Superorder1 - Order1 - Suborder1 - Infraorder1 - Superfamily1 - Family1 - Subfamily1 - Tribe1 - Genus1 - Species1
taxon_columns = crustaceas.columns[21:37] # selecting taxonomy columns
# defining function
def treat_str(x):
return str(x).lower().capitalize().strip()
# applying treatment
for col in taxon_columns:
print(f'Adjusting column {col}')
crustaceas[col] = crustaceas[col].apply(treat_str)
Genus1 and Species1together (they completely identify each animal's species)¶crustaceas['genus_and_species'] = crustaceas['Genus1'] + ' ' + crustaceas['Species1']
crustaceas['genus_and_species'] = crustaceas['genus_and_species'].str.lower().str.capitalize()
Columns: Determined Date1 - Class1 - Kingdom and more...
# slicing main database (crustaceas)
Table = crustaceas[['Cataloged Date','Determined Date1','Start Date','Class1','Kingdom',
'Genus1', 'Species1', 'Type Status1',
'Determiner First_and_Last Name', 'Determiner First Name1',
'Determiner Last Name1', 'genus_and_species','Min Depth', 'Count 1',
'Species Author1', 'Order1', 'Suborder1', 'Infraorder1',
'Superfamily1', 'Family1', 'Subfamily1', 'Tribe1']].copy()
# p.s.: Determined Date1 has many missing values... CHECK THAT
d = []
counter=0
for row in Table['Cataloged Date']:
if not str(row).find('/')==-1:
dates_values = str(row).split("/")
year = int(dates_values[0])
month = int(dates_values[1])
if (month>1) and (month<12):
#store the year and month in a datetime datatype for later sorting
dateRecord = datetime.datetime(year,month,1)
d.append({'cataloged_year':year, 'count':Table.loc[counter,'Count 1'],
'class':Table.loc[counter,'Class1'],
'kingdom':Table.loc[counter,'Kingdom'], 'genus':Table.loc[counter,'Genus1'],
'species':Table.loc[counter,'Species1'],
'genus_and_species': Table.loc[counter,'genus_and_species'],
'type_status':Table.loc[counter,'Type Status1'],
'determiner_first_and_last_name':Table.loc[counter,'Determiner First_and_Last Name'],
'determiner_first_name':Table.loc[counter,'Determiner First Name1'],
'determiner_last_name':Table.loc[counter,'Determiner Last Name1'],
'depth':Table.loc[counter,'Min Depth'],
'species_author':Table.loc[counter,'Species Author1'],
'order':Table.loc[counter,'Order1'],
'suborder':Table.loc[counter,'Suborder1'],
'infraorder':Table.loc[counter, 'Infraorder1'],
'superfamily':Table.loc[counter,'Superfamily1'],
'family':Table.loc[counter,'Family1'],
'subfamily':Table.loc[counter,'Subfamily1'],
'tribe':Table.loc[counter,'Tribe1']
})
counter = counter+1
NewTable = pd.DataFrame(d)
### collecting determined year (p.s.: being careful to keep NaNs as they show up)
NewTable['determined_year'] = np.nan
d1 = []
counter=0
for row in Table['Determined Date1']:
try: # if Determined Date1 is empty, keep it so
if np.isnan(row):
year= np.NAN
except:
if not str(row).find('/')==-1:
dates_values = str(row).split("/")
year = int(dates_values[0])
month = int(dates_values[1])
if (month>1) and (month<12):
#store the year and month in a datetime datatype for later sorting
dateRecord = datetime.datetime(year,month,1)
NewTable.loc[counter, 'determined_year'] = year
counter = counter+1
### collecting start year (p.s.: being careful to keep NaNs as they show up)
NewTable['start_year'] = np.nan
d1 = []
counter=0
for row in Table['Start Date']:
try: # if Start Date is empty, keep it so
if np.isnan(row):
year= np.NAN
except:
if not str(row).find('/')==-1:
dates_values = str(row).split("/")
year = int(dates_values[0])
month = int(dates_values[1])
if (month>1) and (month<12):
#store the year and month in a datetime datatype for later sorting
dateRecord = datetime.datetime(year,month,1)
NewTable.loc[counter, 'start_year'] = year
counter = counter+1
# NewTable['determined_year'] = pd.Series(year, index=NewTable.index)
NewTable.head(2)
# checks if NaNs are in the same position
result = (NewTable['start_year'].isna() == crustaceas['Start Date'].isna()).sum() == NewTable.shape[0]
if result:
print('Start year info is valid.')
else:
print("There's something wrong with NewTable. Check how you're collecting Start Year info.")
# year in which the holotipo was "firstly" found
NewTable['holotipo_year'] = NewTable['species_author'].str.extract('(\d+)')
years columns in crustaceas¶def catch_year(row):
if not str(row).find('/')==-1:
dates_values = str(row).split("/")
year = int(dates_values[0])
month = int(dates_values[1])
return year
else:
return np.NaN
crustaceas['determined_year'] = crustaceas['Determined Date1'].apply(catch_year)
crustaceas['start_year'] = crustaceas['Start Date'].apply(catch_year)
crustaceas['cataloged_year'] = crustaceas['Cataloged Date'].apply(catch_year)
crustaceas['holotipo_year'] = crustaceas['Species Author1'].str.extract('(\d+)')
determined_year, cataloged_year and start_year to *int* format¶def str_with_nan2int(string):
if not np.isnan(string):
return int(string)
else:
return np.NAN
NewTable['determined_year'] = NewTable['determined_year'].apply(str_with_nan2int) #has NaN
# NewTable['determined_year'] = NewTable['determined_year'].astype(int)
NewTable['cataloged_year'] = NewTable['cataloged_year'].astype(int)
NewTable['start_year'] = NewTable['start_year'].apply(str_with_nan2int) #has NaN
teste1 = NewTable.drop_duplicates().copy() # remvoves duplicated rows (with same values in ALL columns)
print('Duplicated registers:',NewTable.shape[0] - NewTable.drop_duplicates().shape[0])
# teste1.head(2)
teste1.sort_values(['start_year','determiner_first_and_last_name'], inplace=True)
# names ordering for y axis
names_ordering = list(
teste1[~teste1['determiner_first_and_last_name'].isna()]['determiner_first_and_last_name'].unique()
)
g1 = alt.Chart(teste1, width=800, height=2200).mark_circle().encode(
x= alt.X('start_year', type='ordinal', title='Year'),
y= alt.Y('determiner_first_and_last_name', type='nominal', title='Determiner Name',
sort= names_ordering),
color= alt.Color('type_status')
)
# saving chart
# g1.save('./types/per_researcher/1-determiner_and_year-null_name_and_null_start_date.svg')
# g1.save('./types/per_researcher/1-determiner_and_year-null_name_and_null_start_date.png')
# g1.save('./types/per_researcher/1-determiner_and_year-null_name_and_null_start_date.html')
g1